**06/23/2021
**file to bring in CPD FOIA salary files received in 2021 & assign unique IDs
 

**Housekeepig
local ver_suff _9_21
clear all
local ver_suff _9_21
clear all
if "`c(username)'"=="taehokim" |"`c(username)'"=="Taeho Kim"{ 
	cd "/Users/`c(username)'/Dropbox (Penn Law)/misconduct/do_files`ver_suff'"
} 
else{
	cd "/Users/`c(username)'/Dropbox/misconduct/do_files`ver_suff'"
}
set more off

//later pass through profiles dataset to fuzzy match 
local profiles_match `1'

*******************
*BRING IN SALARY DATA
*******************

//run the following to bring in salary data

/* loop thru each of the Sheets beginning with Sheet4 */
/*
forvalues i=1(1)9 {
	**Read from 2004-2012
  local year = 2003+`i'
  /* get the data beginning in cell B2 */     
  import excel using ../FOIA/FOIA_salary_051821/CPD_FOIA_REQ_P1-052721.xlsx, ///
         sheet(`year') firstrow clear 
  generate year = `year'   
  
  /* append and save data */
  if `i'==1 {
    save ../processed_data`ver_suff'/FOIA_processing/salary, replace
  }
  else {
    append using ../processed_data`ver_suff'/FOIA_processing/salary.dta
    save ../processed_data`ver_suff'/FOIA_processing/salary, replace
  }
  
}

forvalues i=1(1)9 {
	**Read from 2013-2021
  local year = 2012+`i'
  /* get the data beginning in cell B2 */     
  import excel using ../FOIA/FOIA_salary_051821/CPD_FOIA_REQ_P2-052721.xlsx, ///
         sheet(`year') firstrow clear 
  generate year = `year'   
  
  /* append and save data */
 
 
  append using ../processed_data`ver_suff'/FOIA_processing/salary.dta
  save ../processed_data`ver_suff'/FOIA_processing/salary, replace
 
  
}

use ../processed_data`ver_suff'/FOIA_processing/salary, clear
 **Drop records missing name data
drop if LastName == ""

**Back out birthyear from difference between age at hire and date of hire
gen birthdate = OriginalHireDate9161-AgeatHire*365
format %td birthdate 
gen birthyear = year(birthdate)
 
**Record name 
gen fname = trim(FirstName)
gen lname = trim(LastName)
gen mi = trim(MI)

**Record appointment date
gen appoint_dt = OriginalHireDate9161 
gen appoint_yr = year(appoint_dt)
gen appoint_m = month(appoint_dt)
gen appoint_d = day(appoint_dt)

**Standardize gender variable
rename Gender sex

//some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)

**Save intermediate version of data
save ../processed_data`ver_suff'/FOIA_processing/salary, replace     /* final save of the data */
*/

use ../processed_data`ver_suff'/FOIA_processing/salary, clear

**Deduplicate with respect to matching variables
duplicates drop fname lname_mod appoint_yr appoint_m appoint_d birthyear , force
keep fname lname lname_mod mi appoint_yr appoint_m appoint_d birthyear sex

sort fname lname_mod mi appoint_yr appoint_m appoint_d birthyear 

 **Generate preliminary ID
gen pid = _n 

**Create name soundexes
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)
 
 //if missing birthyear or appointed dates give data from obs that share the same names
sort fname lname appoint_yr appoint_m appoint_d birthyear
replace pid = pid[_n-1] if fname == fname[_n-1] & lname_mod == lname_mod[_n-1] & appoint_yr == appoint_yr[_n-1]& appoint_m == appoint_m[_n-1]& appoint_d == appoint_d[_n-1] & birthyear == .
replace pid = pid[_n-1] if fname == fname[_n-1] & lname_mod == lname_mod[_n-1] & birthyear == birthyear[_n-1] & appoint_yr == .
replace pid = pid[_n-1] if fname == fname[_n-1] & lname_mod == lname_mod[_n-1] & birthyear == . & appoint_yr == .

**Save dataset with preliminary IDs
save ../processed_data`ver_suff'/FOIA_processing/salary_pid, replace
clear
 



*******************
*SET UP LIST OF DATA TO STORE MATCHED IDS
*******************
clear 

set obs 1
gen u_pid = -1
gen pid = -1

save ../processed_data`ver_suff'/FOIA_processing/pid_appending, replace
clear


*******************
*FUZZY MATCHING
*******************

use ../processed_data`ver_suff'/FOIA_processing/salary_pid, clear // start with 23,353

**Perfect name, appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 1 //13,228 so far

**Perfect name, appoint date match, 1 year birth year gap
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 2 //13,474 so far

**Perfect name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 3  

**Perfect first name, soundex last name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 4  

**Soundex both names, middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 5  

**Soundex both names, no middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 6  

**first name, no middle initial, no last name and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 7  
 
**first name, last name, off in appoint_yr 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 1 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 9  


**first name, last name, off in appoint_mo 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 1 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 10  


**first name, last name, off in appoint_day 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 1 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 11  


**first name, last name, off in birth year 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 12  


**first name, last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 13  
 
**soundex first name, soundex last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 14  

 
 
*******************
*KEEP BEST MATCHES
******************* 

use ../processed_data`ver_suff'/FOIA_processing/pid_appending.dta, clear

//drop the less good matches if individuals in awards data are matched with more than one in profiles

sort pid match_no u_pid
quietly by pid:  gen dup = cond(_N==1,0,_n)
sort dup
tab dup

drop if dup>1 
drop dup 
sort pid match_no

tempfile best_matches
save `best_matches'

*******************
*MERGE DE-DUPLICATED SALARY DATA WITH MATCHINGS
******************* 

use ../processed_data`ver_suff'/FOIA_processing/salary_pid, clear

merge m:1 pid using `best_matches', keep(1 3)  //20,010 of 23353 are matched  

 
